library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.3
## ✓ tibble 3.0.0 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## Warning: package 'tibble' was built under R version 3.6.2
## ── Conflicts ───────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(Seurat)
library(dplyr)
pbmc.data <- Read10X(data.dir = "/Users/baigal/Desktop/vcftools/scRNA/filtered_feature_bc_matrix")
pbmc <- CreateSeuratObject(counts = pbmc.data, min.cells = 3, min.features = 200, project = "5k_PBMC", assay = "RNA")
dense.size <- object.size(x = as.matrix(x = pbmc.data))
dense.size
## 1350935128 bytes
sparse.size <- object.size(x = pbmc.data)
sparse.size
## 139163168 bytes
pbmc <- CreateSeuratObject(counts = pbmc.data, min.cells = 3, min.features = 200, project = "5k_PBMC", assay = "RNA")
pbmc
## An object of class Seurat
## 18791 features across 4962 samples within 1 assay
## Active assay: RNA (18791 features)
mito.genes <- grep(pattern = "^MT-", x = rownames(pbmc@assays[["RNA"]]), value = TRUE)
percent.mito <- Matrix::colSums(pbmc@assays[["RNA"]][mito.genes, ])/Matrix::colSums(pbmc@assays[["RNA"]])
pbmc <- AddMetaData(object = pbmc, metadata = percent.mito, col.name = "percent.mito")
VlnPlot(object = pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mito"), ncol = 3)
par(mfrow = c(1, 2))
FeatureScatter(object = pbmc, feature1 = "nCount_RNA", feature2 = "percent.mito")
FeatureScatter(object = pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
pbmc <- subset(x = pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 5000 & percent.mito > -Inf & percent.mito < 0.25 )
pbmc <- NormalizeData(object = pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
pbmc <- FindVariableFeatures(object = pbmc, mean.function = ExpMean, dispersion.function = LogVMR, nfeatures = 2000)
top10 <- head(VariableFeatures(pbmc), 10)
plot1 <- VariableFeaturePlot(pbmc)
plot1
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## Warning: Using `as.character()` on a quosure is deprecated as of rlang 0.3.0.
## Please use `as_label()` or `as_name()` instead.
## This warning is displayed once per session.
## When using repel, set xnudge and ynudge to 0 for optimal results
plot2
##Data Scaling
pbmc <- ScaleData(object = pbmc, vars.to.regress = c("nCounts_RNA", "percent.mito"))
## Regressing out nCounts_RNA, percent.mito
## Centering and scaling data matrix
##PCA
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc), verbose = FALSE)
DimPlot(pbmc, reduction = "pca")
pbmc <- JackStraw(pbmc, num.replicate = 100, dims = 50)
pbmc <- ScoreJackStraw(pbmc, dims = 1:50)
JackStrawPlot(pbmc, dims = 1:30, reduction = "pca")
## Warning: Removed 44221 rows containing missing values (geom_point).
ElbowPlot(pbmc, ndims = 50)
pbmc <- FindNeighbors(pbmc, reduction = "pca", dims = 1:20)
## Computing nearest neighbor graph
## Computing SNN
pbmc <- FindClusters(pbmc, resolution = 0.5, algorithm = 1)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 4595
## Number of edges: 166042
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8998
## Number of communities: 14
## Elapsed time: 0 seconds
pbmc <- RunTSNE(object = pbmc, dims.use = 1:20)
pbmc <- RunUMAP(pbmc, dims = 1:20)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 13:27:53 UMAP embedding parameters a = 0.9922 b = 1.112
## 13:27:53 Read 4595 rows and found 20 numeric columns
## 13:27:53 Using Annoy for neighbor search, n_neighbors = 30
## 13:27:53 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 13:27:54 Writing NN index file to temp file /var/folders/93/dtv9y3wx3_bfwbx4tfjq5q500000gn/T//RtmpHxpLjP/file184ca24e04ea6
## 13:27:54 Searching Annoy index using 1 thread, search_k = 3000
## 13:27:56 Annoy recall = 100%
## 13:27:56 Commencing smooth kNN distance calibration using 1 thread
## 13:27:57 Initializing from normalized Laplacian + noise
## 13:27:57 Commencing optimization for 500 epochs, with 192070 positive edges
## 13:28:06 Optimization finished
DimPlot(object = pbmc, reduction = "tsne")
DimPlot(pbmc, reduction = "umap", label = TRUE)
DimPlot(pbmc, reduction = "pca")
cluster1.markers <- FindMarkers(object = pbmc, ident.1 = 1, min.pct = 0.25)
print(x = head(x = cluster1.markers, n = 5))
## p_val avg_logFC pct.1 pct.2 p_val_adj
## S100A8 0 4.143277 1.000 0.123 0
## S100A9 0 4.125020 1.000 0.233 0
## LYZ 0 3.202571 1.000 0.395 0
## VCAN 0 2.577781 0.992 0.071 0
## MNDA 0 2.564347 1.000 0.109 0
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
## Calculating cluster 13
pbmc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_logFC)
## # A tibble: 28 x 7
## # Groups: cluster [14]
## p_val avg_logFC pct.1 pct.2 p_val_adj cluster gene
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
## 1 0. 0.943 0.856 0.227 0. 0 CCR7
## 2 0. 0.889 0.802 0.195 0. 0 TRABD2A
## 3 0. 4.14 1 0.123 0. 1 S100A8
## 4 0. 4.13 1 0.233 0. 1 S100A9
## 5 1.25e-224 1.26 0.925 0.43 2.35e-220 2 IL7R
## 6 2.46e-198 1.23 0.644 0.146 4.62e-194 2 KLRB1
## 7 1.29e-269 2.00 0.988 0.209 2.42e-265 3 CCL5
## 8 1.20e-164 1.61 0.576 0.085 2.26e-160 3 GZMK
## 9 0. 3.46 0.993 0.119 0. 4 GNLY
## 10 1.98e-288 2.90 1 0.224 3.72e-284 4 NKG7
## # … with 18 more rows
top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(10, avg_logFC)
DoHeatmap(object = pbmc, features = top10$gene, label = TRUE)
## Warning in DoHeatmap(object = pbmc, features = top10$gene, label = TRUE): The
## following features were omitted as they were not found in the scale.data slot
## for the RNA assay: NUCB2, LUC7L3, CSKMT, INTS6, NKTR, HEXIM1, MT-ND5, MALAT1,
## MT-CYB, NOSIP, SARAF, TCF7, LDHB
##read the reference data
pbmc.10k<- readRDS("/Users/baigal/Desktop/vcftools/scRNA/pbmc10k/pbmc_10k_v3.rds")
p1<- DimPlot(pbmc.10k, group.by = "celltype", label = TRUE, repel = TRUE) + NoLegend() + ggtitle("10k pbmc")
p2<- DimPlot(pbmc, group.by = "seurat_clusters", label = TRUE, repel = TRUE) + NoLegend() + ggtitle("5k pbmc")
CombinePlots(plots = list(p1, p2))
## Warning: CombinePlots is being deprecated. Plots should now be combined using
## the patchwork system.
transfer.anchors <- FindTransferAnchors(reference = pbmc.10k, query = pbmc, features = VariableFeatures(object = pbmc.10k), reference.assay = "RNA", query.assay = "RNA", reduction = "pcaproject")
## Warning: Adding a Graph without an assay associated with it
## Warning: Adding a Graph without an assay associated with it
## Warning: Adding a Graph without an assay associated with it
## Warning: Adding a Graph without an assay associated with it
## Performing PCA on the provided reference using 3000 features as input.
## Projecting PCA
## Finding neighborhoods
## Finding anchors
## Found 7790 anchors
## Filtering anchors
## Retained 5062 anchors
## Extracting within-dataset neighbors
celltype.predictions <- TransferData(anchorset = transfer.anchors, refdata = pbmc.10k$celltype,
dims = 1:30)
## Finding integration vectors
## Finding integration vector weights
## Predicting cell labels
pbmc<- AddMetaData(pbmc, metadata = celltype.predictions)
DimPlot(pbmc, group.by = "predicted.id", label = TRUE, repel = TRUE) + NoLegend() + ggtitle("5k pbmc")